import numpy as np
import pandas as pd
# Load the transaction log. The absolute path is specific to the machine this
# notebook was run on.
csv_path = r"C:\Users\HOME\Downloads\archive (1)\PS_20174392719_1491204439457_log.csv"
df = pd.read_csv(csv_path)
# Bare expression so the notebook renders the frame as the cell output.
df
| | step | type | amount | nameOrig | oldbalanceOrg | newbalanceOrig | nameDest | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | PAYMENT | 9839.64 | C1231006815 | 170136.00 | 160296.36 | M1979787155 | 0.00 | 0.00 | 0 | 0 |
| 1 | 1 | PAYMENT | 1864.28 | C1666544295 | 21249.00 | 19384.72 | M2044282225 | 0.00 | 0.00 | 0 | 0 |
| 2 | 1 | TRANSFER | 181.00 | C1305486145 | 181.00 | 0.00 | C553264065 | 0.00 | 0.00 | 1 | 0 |
| 3 | 1 | CASH_OUT | 181.00 | C840083671 | 181.00 | 0.00 | C38997010 | 21182.00 | 0.00 | 1 | 0 |
| 4 | 1 | PAYMENT | 11668.14 | C2048537720 | 41554.00 | 29885.86 | M1230701703 | 0.00 | 0.00 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6362615 | 743 | CASH_OUT | 339682.13 | C786484425 | 339682.13 | 0.00 | C776919290 | 0.00 | 339682.13 | 1 | 0 |
| 6362616 | 743 | TRANSFER | 6311409.28 | C1529008245 | 6311409.28 | 0.00 | C1881841831 | 0.00 | 0.00 | 1 | 0 |
| 6362617 | 743 | CASH_OUT | 6311409.28 | C1162922333 | 6311409.28 | 0.00 | C1365125890 | 68488.84 | 6379898.11 | 1 | 0 |
| 6362618 | 743 | TRANSFER | 850002.52 | C1685995037 | 850002.52 | 0.00 | C2080388513 | 0.00 | 0.00 | 1 | 0 |
| 6362619 | 743 | CASH_OUT | 850002.52 | C1280323807 | 850002.52 | 0.00 | C873221189 | 6510099.11 | 7360101.63 | 1 | 0 |
6362620 rows × 11 columns
# Work on a copy so the frame loaded from disk is left untouched.
data = df.copy()
# Count the missing values in each column.
missing_per_column = data.isna().sum()
print(missing_per_column)
step 0 type 0 amount 0 nameOrig 0 oldbalanceOrg 0 newbalanceOrig 0 nameDest 0 oldbalanceDest 0 newbalanceDest 0 isFraud 0 isFlaggedFraud 0 dtype: int64
# Number of transactions recorded for each transaction type
type_frequencies = data["type"].value_counts()
print(type_frequencies)
CASH_OUT 2237500 PAYMENT 2151495 CASH_IN 1399284 TRANSFER 532909 DEBIT 41432 Name: type, dtype: int64
# Count transactions per type. The counts are bound to `type_counts` rather
# than `type` so the builtin `type()` stays usable in later cells.
type_counts = data["type"].value_counts()
transactions = type_counts.index
quantity = type_counts.values
import plotly.express as px

# Donut chart (hole=0.5) showing each transaction type's share of the total.
# NOTE(review): `data` is passed as the data frame even though `values` and
# `names` are supplied as arrays; confirm whether it is actually needed.
figure = px.pie(
    data,
    values=quantity,
    names=transactions,
    hole=0.5,
    title="Distribution of Transaction Type",
)
figure.show()
# Checking correlation
# Restrict the computation to numeric columns explicitly: the frame still
# holds string columns (type, nameOrig, nameDest) at this point, and relying
# on the implicit default is deprecated by pandas.
correlation = data.corr(numeric_only=True)
# Rank every numeric column by its correlation with the fraud label.
print(correlation["isFraud"].sort_values(ascending=False))
C:\Users\HOME\AppData\Local\Temp\ipykernel_1940\3404805963.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
isFraud 1.000000 amount 0.076688 isFlaggedFraud 0.044109 step 0.031578 oldbalanceOrg 0.010154 newbalanceDest 0.000535 oldbalanceDest -0.005885 newbalanceOrig -0.008148 Name: isFraud, dtype: float64
# Encode each transaction type as an integer code.
type_codes = {
    "CASH_OUT": 1,
    "PAYMENT": 2,
    "CASH_IN": 3,
    "TRANSFER": 4,
    "DEBIT": 5,
}
data["type"] = data["type"].map(type_codes)

# Replace the 0/1 fraud flag with readable labels.
fraud_labels = {0: "No Fraud", 1: "Fraud"}
data["isFraud"] = data["isFraud"].map(fraud_labels)

print(data.head())
step type amount nameOrig oldbalanceOrg newbalanceOrig \
0 1 2 9839.64 C1231006815 170136.0 160296.36
1 1 2 1864.28 C1666544295 21249.0 19384.72
2 1 4 181.00 C1305486145 181.0 0.00
3 1 1 181.00 C840083671 181.0 0.00
4 1 2 11668.14 C2048537720 41554.0 29885.86
nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
0 M1979787155 0.0 0.0 No Fraud 0
1 M2044282225 0.0 0.0 No Fraud 0
2 C553264065 0.0 0.0 Fraud 0
3 C38997010 21182.0 0.0 Fraud 0
4 M1230701703 0.0 0.0 No Fraud 0
# splitting the data
from sklearn.model_selection import train_test_split

# Columns used as model inputs and as the prediction target.
feature_columns = ["type", "amount", "oldbalanceOrg", "newbalanceOrig"]
target_columns = ["isFraud"]

x = np.array(data[feature_columns])
# Selecting with a list keeps the target two-dimensional (a single column).
y = np.array(data[target_columns])
# training a machine learning model
from sklearn.tree import DecisionTreeClassifier

# Hold out 10% of the rows for evaluation; the seed fixes the split.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.10, random_state=42
)
# Seed the tree as well, so repeated runs fit the same model and report the
# same score (the split alone being seeded is not enough for that).
model = DecisionTreeClassifier(random_state=42)
model.fit(xtrain, ytrain)
# Mean accuracy on the held-out rows.
print(model.score(xtest, ytest))
0.999732814469511
# prediction
# Feature order: [type, amount, oldbalanceOrg, newbalanceOrig]
sample = [4, 9000.60, 9000.60, 0.0]
features = np.array([sample])
print(model.predict(features))
['Fraud']